{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Kaggle-房价预测（进阶版）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## step 1 检视数据源"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_tra = pd.read_csv(\"input/train.csv\", index_col=0)\n",
    "data_tes = pd.read_csv(\"input/test.csv\", index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>LotConfig</th>\n",
       "      <th>...</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>208500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>181500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>223500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Corner</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "      <td>140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 80 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "Id                                                                    \n",
       "1           60       RL         65.0     8450   Pave   NaN      Reg   \n",
       "2           20       RL         80.0     9600   Pave   NaN      Reg   \n",
       "3           60       RL         68.0    11250   Pave   NaN      IR1   \n",
       "4           70       RL         60.0     9550   Pave   NaN      IR1   \n",
       "5           60       RL         84.0    14260   Pave   NaN      IR1   \n",
       "\n",
       "   LandContour Utilities LotConfig    ...     PoolArea PoolQC Fence  \\\n",
       "Id                                    ...                             \n",
       "1          Lvl    AllPub    Inside    ...            0    NaN   NaN   \n",
       "2          Lvl    AllPub       FR2    ...            0    NaN   NaN   \n",
       "3          Lvl    AllPub    Inside    ...            0    NaN   NaN   \n",
       "4          Lvl    AllPub    Corner    ...            0    NaN   NaN   \n",
       "5          Lvl    AllPub       FR2    ...            0    NaN   NaN   \n",
       "\n",
       "   MiscFeature MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  \n",
       "Id                                                                         \n",
       "1          NaN       0      2    2008        WD         Normal     208500  \n",
       "2          NaN       0      5    2007        WD         Normal     181500  \n",
       "3          NaN       0      9    2008        WD         Normal     223500  \n",
       "4          NaN       0      2    2006        WD        Abnorml     140000  \n",
       "5          NaN       0     12    2008        WD         Normal     250000  \n",
       "\n",
       "[5 rows x 80 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_tra.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## step 2 合并数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001CE5CEA4358>,\n",
       "        <matplotlib.axes._subplots.AxesSubplot object at 0x000001CE5CF1D390>]],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEICAYAAABRSj9aAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAGvRJREFUeJzt3XuQXOV55/Hvz4hbGIO4zsqSYsGiJbCr4jZF5CLrnSDH4baI7EICqzICyyVXgrO4UMUWJpWsd51dkS2byzqFrTLGwou5LJiFAI5NhLo2VIIwN0uArDCwMprSGIEBYYFhI/vZP87b0Gr1TPfMnOk+8+r3qerqc95ze+bMmWfefvs971FEYGZm+fpArwMwM7Op5URvZpY5J3ozs8w50ZuZZc6J3swsc070ZmaZc6LvAklbJH2s13GYVZ2kfy1pc6/jyI0TfWYk/bakdZJ2SNrS63jMxiMi/i4ijut1HLlxop+GJA1Kqo2y+C3gm8CfdC8is8mTNKPXMeTKib6LJO0v6TpJ29LrOkn7Nyz/nKSRtOxTkkLSseM5RkQ8FhHfBl4s/Qcwm4DUdHmVpOckvS7pZkkHpArLsKTPS/opcHO9rGHbuZK+K+kVST+T9NWGZZ+UtCnt8/uSPtyTH3AacKLvrquBhcBJwInAacCfAkg6E7gS+BhwLPBvehSj2VRYAvwu8M+Bf0G67oF/BhwGfBhY3riBpH2A+4GfAPOA2cDtadn5wBeAfwccCfwdcNsU/wzTlhN9dy0B/nNEbI+IV4AvAp9Iy34fuDkino2It9Mys1x8NSK2RsRrwF8AF6fyXwF/HhHvRsQvmrY5DfgQ8CcR8VZEvBMRj6Rlnwb+W0RsiohdwH8FTnKtvjUn+u76EEXtpO4nqay+bGvDssZpJK2U9IakNyhqOb9Vn09lZlXWeD03XvevRMQ7o2wzF/hJSuTNPgxc33D9vwaIotZvTZzou2sbxQVa9+upDGAEmNOwbG7jhhGxKiJmRsRM4Fzgkfp8KjOrssbrufG6H2v43K3Ar4/yJe1W4NONfwMRcWBE/H1J8WbFib67bgP+VNKRko4A/gz4n2nZncBlko6X9Gtp2bhJ+oCkA4B9i1kdIGm/MoI3m4TLJc2RdBhF2/odHWzzGEUFaJWkg9K1fHpa9jXgKkn/EkDSIZIunJLIM+BE311fAh4HNgAbgSdTGRHxPeAGYB0wBPxD2ubdcR7jo8AvgAcpak6/AH4w2cDNJuk7FNfhi+n1pXYbRMQvgX9L0TnhJWAY+IO07B7gGuB2SW8CzwBnTUnkGZAfPFJNko6nuHj3H6WN0mxaSDfufSoi/rbXseytXKOvEEm/J2k/SYdS1Fb+2knezCbLib5aPg28ArwA/BL4w96GY2Y5cNONmVnmXKM3M8tcJQYROuKII2LevHldP+5bb73FQQcd1PXjToRjbe+JJ554NSKO7PqBJ+CII46II488slK/0ypeY45pbB1f8xHR89epp54avbBu3bqeHHciHGt7wONRgeu5k9epp55aud9p1eKJcEztdHrNu+nGzCxzTvRmZplzojczy5wTvZlZ5pzozcwy50RvZpY5J3ozs8w50ZuZZc6J3swsc5UYAmFvM2/lA+Pe5ltnVuOWa6umiVxTW1adMwWRWBW5Rm9mljknejOzzDnRmzWRdJykpxteb0r6rKTDJD0k6fn0fmhaX5JukDQkaYOkU3r9M5g1cqI3axIRmyPipIg4CTgVeBu4B1gJrI2I+cDaNA/FQ6nnp9dy4MbuR202Oid6s7EtAl6IiJ8Ai4E1qXwNcH6aXgzckkaOfRSYKWlW90M1a829bszGdhFwW5ruj4gRgIgYkXRUKp8NbG3YZjiVjTTuSNJyiho//f397Ny5k1qtVkqQKxaM/xnyzccuM56yOKZyONGbjULSfsB5wFXtVm1RtsfDmCNiNbAaYGBgIPr6+hgcHJxsmABcOpHulUt2P3atVistnrI4pnK46cZsdGcBT0bEy2n+5XqTTHrfnsqHgbkN280BtnUtSrM2nOjNRncx7zfbANwHLE3TS4F7G8ovSb1vFgI76k08ZlXgphuzFiT9GvA7wKcbilcBd0paBrwEXJjKHwTOBoYoeuhc1sVQzdpyojdrISLeBg5vKvsZRS+c5nUDuLxLoZmNm5tuzMwy50RvZpY5J3ozs8w50ZuZZc6J3swsc070ZmaZc6I3M8ucE72ZWeac6M3MMudEb2aWOSd6M7PMOdGbmWXOid7MLHNO9GZmmXOiNzPLXMeJXtI+kp6SdH+aP1rSeknPS7ojPV8TSfun+aG0fN7UhG5mZp0YT43+CmBTw/w1wLURMR94HViWypcBr0fEscC1aT0zM+uRjhK9pDnAOcA30ryAM4C70iprgPPT9OI0T1q+KK1vZmY90GmN/jrgc8Cv0vzhwBsRsSvNDwOz0/RsYCtAWr6DpkeymZlZ97R9Zqykc4HtEfGEpMF6cYtVo4NljftdDiwH6O/vp1ardRJvqXbu3NmT465YsKv9Sk22v7aD/3HrvePaZsHsQ8Z9nDL06ryaWWudPBz8dOA8SWcDBwAHU9TwZ0qakWrtc4Btaf1hYC4wLGkGcAjwWvNOI2I1sBpgYGAgBgcHJ/mjjF+tVqMXx7105QPj3mbFgl18eeP4nuW+ZcnguI9Thl6d1zJJmknRVPmvKCoqnwQ2A3cA84AtwO9HxOupafJ64GzgbeDSiHiyB2GbtdS26SYiroqIORExD7gIeDgilgDrgAvSakuBenXzvjRPWv5wROxRozeruOuBv4mI3wBOpOiIsBJYmzogrE3zAGcB89NrOXBj98M1G91k+tF/HrhS0hBFG/xNqfwm4PBUfiXv/zGYTQuSDgY+SrqmI+L/RcQb7N7RoLkDwi1ReJTi0+6sLodtNqpxtQVERA2opekXgdNarPMOcGEJsZn1yjHAK8DNkk4EnqDoXtwfESMAETEi6ai0/nsdEJJ654SRxp02fy9V5ncZE/nep/nYVfxuxTGVY3yNvmZ7hxnAKcAfR8R6Sdcz9ifTjjogNH8v1dfXV9p3GRP53qf5O5wqfrfimMrhIRDM9jQMDEfE+jR/F0Xif7neJJPetzesP7dh+8bOCWY950Rv1iQifgpslXRcKloEPMfuHQ2aOyBcosJCYEe9icesCtx0Y9baHwO3pjGcXgQuo6gY3SlpGfAS738X9SBF18ohiu6Vl3U/XLPROdGbtRARTwMDLRYtarFuAJdPeVBmE+SmGzOzzDnRm5llzonezCxzTvRmZplzojczy5wTvZlZ5pzozcwy50RvZpY5J3ozs8w50ZuZZc6J3swsc070ZmaZc6I3M8ucE72ZWeac6M3MMudEb2aWOSd6M7PMOdGbmWXOid7MLHNO9GYtSNoiaaOkpyU9nsoOk/SQpOfT+6GpXJJukDQkaYOkU3obvdnunOjNRvfbEXFSRNQfEr4SWBsR84G1aR7gLGB+ei0Hbux6pGZjcKI369xiYE2aXgOc31B+SxQeBWZKmtWLAM1acaI3ay2AH0h6QtLyVNYfESMA6f2oVD4b2Nqw7XAqM6uEGb0OwKyiTo+IbZKOAh6S9OMx1lWLsthjpeIfxnKA/v5+du7cSa1WKyXYFQt2jXub5mOXGU9ZHFM5nOjNWoiIbel9u6R7gNOAlyXNioiR1DSzPa0+DMxt2HwOsK3FPlcDqwEGBgair6+PwcHBUuK9dOUD495my5Ldj12r1UqLpyyOqRxuujFrIukgSR+sTwMfB54B7gOWptWWAvem6fuAS1Lvm4XAjnoTj1kVuEZvtqd+4B5JUPyNfCci/kbSD4E7JS0DXgIuTOs/CJwNDAFvA5d1P2Sz0TnRmzWJiBeBE1uU/wxY1KI8gMu7EJrZhLjpxswsc070ZmaZa5voJR0g6TFJP5L0rKQvpvKjJa1Pt4PfIWm/VL5/mh9Ky+dN7Y9gZmZj6aRG/y5wRkScCJwEnJl6FlwDXJtuB38dWJbWXwa8HhHHAtem9czMrEfaJvp0W/fONLtvegVwBnBXKm++Hbx+m/hdwCKl7gtmZtZ9HbXRS9pH0tMUN4g8BLwAvBER9dvxGm/5fu928LR8B3B4mUGbmVnnOupeGRG/BE6SNBO4Bzi+1WrpfUK3g/filuJe3co8kdvV+w8c/3a9uk17Ot4ibpazcfWjj4g3JNWAhRQj9M1ItfbGW77rt4MPS5oBHAK81mJfu90O3otbint1K/NEbldfsWAXX944vtsemm9x75bpeIu4Wc466XVzZKrJI+lA4GPAJmAdcEFarfl28Ppt4hcAD6cbSszMrAc6qSLOAtZI2ofiH8OdEXG/pOeA2yV9CXgKuCmtfxPwbUlDFDX5i6YgbjMz61DbRB8RG4CTW5S/SDGiX3P5O7w/BoiZmfWY74w1M8ucE72ZWeac6M3MMudEb2aWOSd6M7PMOdGbmWXOid7MLHNO9GZmmXOiNzPLnBO92SjS8NxPSbo/zfupajYtOdGbje4KigH86vxUNZuWnOjNWpA0BzgH+EaaF36qmk1T4xvg3GzvcR3wOeCDaf5wOnyqmqT6U9Vebdxh88N2ynxAy0QeZtN87Co+MMYxlcOJ3qyJpHOB7RHxhKTBenGLVcf1VLXmh+309fWV9oCWiTzMpvnBNFV8YIxjKocTvdmeTgfOk3Q2cABwMEUNf1JPVTPrFbfRmzWJiKsiYk5EzKN4cM7DEbEEP1XNpiknerPOfR64Mj097XB2f6ra4an8SmBlj+Iza8lNN2ZjiIgaUEvTfqqaTUuu0ZuZZc6J3swsc266MdtLzWvqkrliwa6OumluWXXOVIVkU8Q1ejOzzDnRm5llzonezCxzTvRmZplzojczy5wTvZlZ5pzozcwy50RvZpY5J3ozs8z5ztiMNd/52Anf9WiWH9fozcwy50RvZpY5J3ozs8w50ZuZZc6J3swsc20TvaS5ktZJ2iTpWUlXpPLDJD0k6fn0fmgql6QbJA1J2iDplKn+IczMbHSd1Oh3ASsi4nhgIXC5pBMoHoC8NiLmA2t5/4HIZwHz02s5cGPpUZuZWcfaJvqIGImIJ9P0z4FNwGxgMbAmrbYGOD9NLwZuicKjwExJs0qP3GyKSDpA0mOSfpQ+xX4xlR8taX36FHuHpP1S+f5pfigtn9fL+M2ajeuGqXQBnwysB/ojYgSKfwaSjkqrzQa2Nmw2nMpGmva1nKLGT39/P7VabfzRT9LOnTt7ctwVC3aNe5v+Aye23XiVcT56dV5L9C5wRkTslLQv8Iik7wFXAtdGxO2SvgYso/jEugx4PSKOlXQRcA3wB70K3qxZx4leUh9wN/DZiHhT0qirtiiLPQoiVgOrAQYGBmJwcLDTUEpTq9XoxXE7eS5nsxULdvHljVN/I/OWJYOT3kevzmtZIiKAnWl23/QK4AzgP6TyNcB/okj0i9M0wF3AVyUp7ces5zrKHKlWczdwa0R8NxW/LGlWqs3PAran8mFgbsPmc4BtZQVs1g2S9gGeAI4F/gp4AXgjIuofq+qfVKHhU2xE7JK0AzgceLVpn7t9ii3zk08Zn/Y6/dTYzU9rVfx0WMWY2mmb6FVU3W8CNkXEVxoW3QcsBVal93sbyj8j6XbgN4Ed9SYes+kiIn4JnCRpJnAPcHyr1dL7hD7F9vX1lfbJZyKfEpt1+qmxjE99narip8MqxtROJzX604FPABslPZ3KvkCR4O+UtAx4CbgwLXsQOBsYAt4GLis1YrMuiog3JNUoepzNlDQj1eobP6nWP8UOS5oBHAK81ot4zVppm+gj4hFa11gAFrVYP4DLJxmXWc9IOhL4p5TkDwQ+RvEF6zrgAuB29vwUuxT4h7T8YbfPW5V4mGKzPc0C1qR2+g8Ad0bE/ZKeA26X9CXgKYomTdL7tyUNUdTkL+pF0GajcaKfhImM927VFxEbKLoRN5e/CJzWovwd3m+6NKscj3VjZpY5J3ozs8w50ZuZZc6J3swsc070ZmaZc6I3M8ucE72ZWeac6M3MMudEb2aWOSd6M7PMOdGbmWXOid7MLHNO9GZmmXOiNzPLnBO9mVnmnOjNzDLnRG9mljknejOzzDnRm5llzonerImkuZLWSdok6VlJV6TywyQ9JOn59H5oKpekGyQNSdog6ZTe/gRmu3OiN9vTLmBFRBwPLAQul3QCsBJYGxHzgbVpHuAsYH56LQdu7H7IZqNzojdrEhEjEfFkmv45sAmYDSwG1qTV1gDnp+nFwC1ReBSYKWlWl8M2G5UTvdkYJM0DTgbWA/0RMQLFPwPgqLTabGBrw2bDqcysEmb0OgCzqpLUB9wNfDYi3pQ06qotyqLF/pZTNO3Q39/Pzp07qdVqpcS6YsGuSe+j/8DO9lNWzJ0o8xyVpYoxteNEb9aCpH0pkvytEfHdVPyypFkRMZKaZran8mFgbsPmc4BtzfuMiNXAaoCBgYHo6+tjcHCwlHgvXfnApPexYsEuvryxfUrYsmRw0sfqVK1WK+0claWKMbXjphuzJiqq7jcBmyLiKw2L7gOWpumlwL0N5Zek3jcLgR31Jh6zKnCN3mxPpwOfADZKejqVfQFYBdwpaRnwEnBhWvYgcDYwBLwNXNbdcM3G5kRv1iQiHqF1uzvAohbrB3D5lAZlNgluujEzy5wTvZlZ5pzozcwy50RvZpY5J3ozs8y1TfSSvilpu6RnGso8ip+Z2TTRSY3+W8CZTWUexc/MbJpom+gj4v8ArzUVexQ/M7NpYqI3TO02ip+kdqP47XE7ePMAT70YJGiygxOVMZBUpzodcGqyyvg9TMdBn8xyVvadsR2N4gd7DvDUi0GCJjs4URkDSXWq0wGnJquMAaum46BPVTKvi9eV7R0m2uvm5XqTzERG8TMzs+6ZaKL3KH5mZtNE27YASbcBg8ARkoaBP8ej+GVros0GW1adU3IkZlaWtok+Ii4eZZFH8TPbC02kMuCKQG/5zlgzs8w50ZuZZc6J3swsc070ZmaZc6I3M8ucE72ZWeac6M1a8PDclhMnerPWvoWH57ZMONGbteDhuS0nUz8colk+JjU8d/PQ3KMN59zN4a8bTeVQ2BMdtrqKQ15XMaZ2nOjNJq+j4bmbh+bu6+trOZxzN4e/bjSVQ2FPdPjrKg55XcWY2nHTjVnnPDy3TUtO9Gad8/DcNi256casBQ/PbTlxok/8+DZr5OG5LSduujEzy5wTvZlZ5pzozcwy50RvZpY5J3ozs8w50ZuZZc6J3swsc070ZmaZ8w1TZjblJnJD4pZV50xBJHsn1+jNzDLnRG9mljknejOzzDnRm5llzonezCxz7nVjpWjsVbFiwa6OHofnXhVm3eEavZlZ5pzozcwy56YbM6ukeSsf6LgZsM7Nga1lmeg7vQtvvBeRmdl05KYbM7PMTUmNXtKZwPXAPsA3ImLVVBzHprfcxj/xdW9VVXqil7QP8FfA7wDDwA8l3RcRz01kfxNJBmbdVvZ1b1amqajRnwYMRcSLAJJuBxYDvuAtZ77uK6AbFcNuf7dXxqdYRUQJoTTsULoAODMiPpXmPwH8ZkR8pmm95cDyNHscsLnUQDpzBPBqD447EY61vQ9HxJE9OG5H132La/5nVOt3WsVrzDGNraNrfipq9GpRtsd/k4hYDayeguN3TNLjETHQyxg65Vgrr+1133zNV+08VS0ecExlmYpeN8PA3Ib5OcC2KTiOWZX4urfKmopE/0NgvqSjJe0HXATcNwXHMasSX/dWWaU33UTELkmfAb5P0c3smxHxbNnHKUlPm47GybFW2ASv+6qdp6rFA46pFKV/GWtmZtXiO2PNzDLnRG9mlrm9JtFL+qak7ZKeaSi7UNKzkn4lqTLdpUaJ9b9L+rGkDZLukTSzlzHWjRLrf0lxPi3pB5I+1MsYq0bSmZI2SxqStLKE/c2VtE7SpnQ9X5HKD5P0kKTn0/uhqVySbkjH3yDplIZ9LU3rPy9paUP5qZI2pm1ukKSxjtGw3T6SnpJ0f5o/WtL6tP4d6YtrJO2f5ofS8nkN+7gqlW+W9LvtzuNox0jLZkq6K/0tbZL0kSqcpykXEXvFC/gocArwTEPZ8RQ3rtSAgV7H2CbWjwMz0vQ1wDW9jnOMWA9umP6PwNd6HWdVXhRf1L4AHAPsB/wIOGGS+5wFnJKmPwj8I3AC8JfAylS+sn7NAGcD36Po+78QWJ/KDwNeTO+HpulD07LHgI+kbb4HnJXKWx6jIbYrge8A96f5O4GL0vTXgD9M039Uv04oeizdkaZPSOdof+DodO72Ges8jnaMNL8G+FSa3g+YWYXzNOXXXa8v/C7/kc1rTEgN5TUqlOjHijUt+z3g1l7H2GGsVwE39jrGqrxSEvh+0/m5quRj3Esx5s5mYFYqmwVsTtNfBy5uWH9zWn4x8PWG8q+nslnAjxvK31tvtGOk+TnAWuAM4P6U/F7l/QrLe+eCorfSR9L0jLSems9Pfb3RzmObYxwM/F9SJ5Tmn79X56kbr72m6SYzn6SoLVSWpL+QtBVYAvxZr+OpkNnA1ob54VRWitTkcTKwHuiPiBGA9H5UmxjGKh8eJebRjgFwHfA54Fdp/nDgjYjY1WI/7x07Ld+R1h9vrGMd4xjgFeDm1Jz0DUkHVeA8TTkn+mlG0tXALuDWXscyloi4OiLmUsT5mXbr70U6GiJkQjuW+oC7gc9GxJsTiGG85WPFci6wPSKe6OC4ZcY01jFmUDQz3hgRJwNvUTSjjGbKz1O3ONFPI+lLn3OBJZE+A04D3wH+fa+DqJApGSpB0r4USf7WiPhuKn5Z0qy0fBawvU0MY5XPGSXm0Y5xOnCepC3A7RTNN9cBMyXNaLGf946dlh8CvDaBWF9tc4zhiFif5u+iSPy9PE9d4UQ/Tah4qMXngfMi4u1exzMWSfMbZs8DftyrWCqo9KESUs+Om4BNEfGVhkX3AfUeIUsp2u7r5ZekXiULgR2pOeH7wMclHZp6hXycon17BPi5pIXpWJc07WuPY0TEVRExJyLmpZ/x4YhYAqwDLhglpvp+LkjrRyq/KPXKORqYT/GFZ8vzmLZpeYyI+CmwVdJxadkiimGke3aeuqabXwj08gXcBowA/0Txn3cZxZeaw8C7wMs0fLlTwViHKNoFn06vSvRkGSXWu4FngA3AXwOzex1nlV4UvTn+kaLXyNUl7O+3KJoINjRcH2dTtFevBZ5P74el9UXxkJQXgI00dESg+P5nKL0uaygfSL/TF4Cv8v5d9S2P0RTfIO/3ujmGIlEPAf8L2D+VH5Dmh9LyYxq2vzoddzOpF8tY53G0Y6RlJwGPp3P1vyl6zVTiPE3ly0MgmJllzk03ZmaZc6I3M8ucE72ZWeac6M3MMudEb2aWOSd6M7PMOdGbmWXu/wOgcMXqEUEKvQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x1ce5ce80cc0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "prices = pd.DataFrame({\"price\" : data_tra[\"SalePrice\"], \"log+1\" : np.log1p(data_tra[\"SalePrice\"])})\n",
    "prices.hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = np.log1p(data_tra.pop(\"SalePrice\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>LotConfig</th>\n",
       "      <th>...</th>\n",
       "      <th>ScreenPorch</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Corner</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 79 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "Id                                                                    \n",
       "1           60       RL         65.0     8450   Pave   NaN      Reg   \n",
       "2           20       RL         80.0     9600   Pave   NaN      Reg   \n",
       "3           60       RL         68.0    11250   Pave   NaN      IR1   \n",
       "4           70       RL         60.0     9550   Pave   NaN      IR1   \n",
       "5           60       RL         84.0    14260   Pave   NaN      IR1   \n",
       "\n",
       "   LandContour Utilities LotConfig      ...       ScreenPorch PoolArea PoolQC  \\\n",
       "Id                                      ...                                     \n",
       "1          Lvl    AllPub    Inside      ...                 0        0    NaN   \n",
       "2          Lvl    AllPub       FR2      ...                 0        0    NaN   \n",
       "3          Lvl    AllPub    Inside      ...                 0        0    NaN   \n",
       "4          Lvl    AllPub    Corner      ...                 0        0    NaN   \n",
       "5          Lvl    AllPub       FR2      ...                 0        0    NaN   \n",
       "\n",
       "   Fence MiscFeature MiscVal  MoSold  YrSold  SaleType  SaleCondition  \n",
       "Id                                                                     \n",
       "1    NaN         NaN       0       2    2008        WD         Normal  \n",
       "2    NaN         NaN       0       5    2007        WD         Normal  \n",
       "3    NaN         NaN       0       9    2008        WD         Normal  \n",
       "4    NaN         NaN       0       2    2006        WD        Abnorml  \n",
       "5    NaN         NaN       0      12    2008        WD         Normal  \n",
       "\n",
       "[5 rows x 79 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#合并数据集\n",
    "data_all = pd.concat([data_tra, data_tes], axis=0)\n",
    "data_all.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>BsmtFinSF2</th>\n",
       "      <th>...</th>\n",
       "      <th>GarageArea</th>\n",
       "      <th>WoodDeckSF</th>\n",
       "      <th>OpenPorchSF</th>\n",
       "      <th>EnclosedPorch</th>\n",
       "      <th>3SsnPorch</th>\n",
       "      <th>ScreenPorch</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2433.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2896.000000</td>\n",
       "      <td>2918.000000</td>\n",
       "      <td>2918.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>2918.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>57.137718</td>\n",
       "      <td>69.305795</td>\n",
       "      <td>10168.114080</td>\n",
       "      <td>6.089072</td>\n",
       "      <td>5.564577</td>\n",
       "      <td>1971.312778</td>\n",
       "      <td>1984.264474</td>\n",
       "      <td>102.201312</td>\n",
       "      <td>441.423235</td>\n",
       "      <td>49.582248</td>\n",
       "      <td>...</td>\n",
       "      <td>472.874572</td>\n",
       "      <td>93.709832</td>\n",
       "      <td>47.486811</td>\n",
       "      <td>23.098321</td>\n",
       "      <td>2.602261</td>\n",
       "      <td>16.062350</td>\n",
       "      <td>2.251799</td>\n",
       "      <td>50.825968</td>\n",
       "      <td>6.213087</td>\n",
       "      <td>2007.792737</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>42.517628</td>\n",
       "      <td>23.344905</td>\n",
       "      <td>7886.996359</td>\n",
       "      <td>1.409947</td>\n",
       "      <td>1.113131</td>\n",
       "      <td>30.291442</td>\n",
       "      <td>20.894344</td>\n",
       "      <td>179.334253</td>\n",
       "      <td>455.610826</td>\n",
       "      <td>169.205611</td>\n",
       "      <td>...</td>\n",
       "      <td>215.394815</td>\n",
       "      <td>126.526589</td>\n",
       "      <td>67.575493</td>\n",
       "      <td>64.244246</td>\n",
       "      <td>25.188169</td>\n",
       "      <td>56.184365</td>\n",
       "      <td>35.663946</td>\n",
       "      <td>567.402211</td>\n",
       "      <td>2.714762</td>\n",
       "      <td>1.314964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>20.000000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>1300.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1872.000000</td>\n",
       "      <td>1950.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>2006.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>20.000000</td>\n",
       "      <td>59.000000</td>\n",
       "      <td>7478.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1953.500000</td>\n",
       "      <td>1965.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>320.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>2007.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>50.000000</td>\n",
       "      <td>68.000000</td>\n",
       "      <td>9453.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1973.000000</td>\n",
       "      <td>1993.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>368.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>480.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2008.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>70.000000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>11570.000000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2001.000000</td>\n",
       "      <td>2004.000000</td>\n",
       "      <td>164.000000</td>\n",
       "      <td>733.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>576.000000</td>\n",
       "      <td>168.000000</td>\n",
       "      <td>70.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>2009.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>190.000000</td>\n",
       "      <td>313.000000</td>\n",
       "      <td>215245.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>1600.000000</td>\n",
       "      <td>5644.000000</td>\n",
       "      <td>1526.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1488.000000</td>\n",
       "      <td>1424.000000</td>\n",
       "      <td>742.000000</td>\n",
       "      <td>1012.000000</td>\n",
       "      <td>508.000000</td>\n",
       "      <td>576.000000</td>\n",
       "      <td>800.000000</td>\n",
       "      <td>17000.000000</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 36 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        MSSubClass  LotFrontage        LotArea  OverallQual  OverallCond  \\\n",
       "count  2919.000000  2433.000000    2919.000000  2919.000000  2919.000000   \n",
       "mean     57.137718    69.305795   10168.114080     6.089072     5.564577   \n",
       "std      42.517628    23.344905    7886.996359     1.409947     1.113131   \n",
       "min      20.000000    21.000000    1300.000000     1.000000     1.000000   \n",
       "25%      20.000000    59.000000    7478.000000     5.000000     5.000000   \n",
       "50%      50.000000    68.000000    9453.000000     6.000000     5.000000   \n",
       "75%      70.000000    80.000000   11570.000000     7.000000     6.000000   \n",
       "max     190.000000   313.000000  215245.000000    10.000000     9.000000   \n",
       "\n",
       "         YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1   BsmtFinSF2  \\\n",
       "count  2919.000000   2919.000000  2896.000000  2918.000000  2918.000000   \n",
       "mean   1971.312778   1984.264474   102.201312   441.423235    49.582248   \n",
       "std      30.291442     20.894344   179.334253   455.610826   169.205611   \n",
       "min    1872.000000   1950.000000     0.000000     0.000000     0.000000   \n",
       "25%    1953.500000   1965.000000     0.000000     0.000000     0.000000   \n",
       "50%    1973.000000   1993.000000     0.000000   368.500000     0.000000   \n",
       "75%    2001.000000   2004.000000   164.000000   733.000000     0.000000   \n",
       "max    2010.000000   2010.000000  1600.000000  5644.000000  1526.000000   \n",
       "\n",
       "          ...        GarageArea   WoodDeckSF  OpenPorchSF  EnclosedPorch  \\\n",
       "count     ...       2918.000000  2919.000000  2919.000000    2919.000000   \n",
       "mean      ...        472.874572    93.709832    47.486811      23.098321   \n",
       "std       ...        215.394815   126.526589    67.575493      64.244246   \n",
       "min       ...          0.000000     0.000000     0.000000       0.000000   \n",
       "25%       ...        320.000000     0.000000     0.000000       0.000000   \n",
       "50%       ...        480.000000     0.000000    26.000000       0.000000   \n",
       "75%       ...        576.000000   168.000000    70.000000       0.000000   \n",
       "max       ...       1488.000000  1424.000000   742.000000    1012.000000   \n",
       "\n",
       "         3SsnPorch  ScreenPorch     PoolArea       MiscVal       MoSold  \\\n",
       "count  2919.000000  2919.000000  2919.000000   2919.000000  2919.000000   \n",
       "mean      2.602261    16.062350     2.251799     50.825968     6.213087   \n",
       "std      25.188169    56.184365    35.663946    567.402211     2.714762   \n",
       "min       0.000000     0.000000     0.000000      0.000000     1.000000   \n",
       "25%       0.000000     0.000000     0.000000      0.000000     4.000000   \n",
       "50%       0.000000     0.000000     0.000000      0.000000     6.000000   \n",
       "75%       0.000000     0.000000     0.000000      0.000000     8.000000   \n",
       "max     508.000000   576.000000   800.000000  17000.000000    12.000000   \n",
       "\n",
       "            YrSold  \n",
       "count  2919.000000  \n",
       "mean   2007.792737  \n",
       "std       1.314964  \n",
       "min    2006.000000  \n",
       "25%    2007.000000  \n",
       "50%    2008.000000  \n",
       "75%    2009.000000  \n",
       "max    2010.000000  \n",
       "\n",
       "[8 rows x 36 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## step 3 变量转化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_all[\"MSSubClass\"] = data_all[\"MSSubClass\"].astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass_120</th>\n",
       "      <th>MSSubClass_150</th>\n",
       "      <th>MSSubClass_160</th>\n",
       "      <th>MSSubClass_180</th>\n",
       "      <th>MSSubClass_190</th>\n",
       "      <th>MSSubClass_20</th>\n",
       "      <th>MSSubClass_30</th>\n",
       "      <th>MSSubClass_40</th>\n",
       "      <th>MSSubClass_45</th>\n",
       "      <th>MSSubClass_50</th>\n",
       "      <th>MSSubClass_60</th>\n",
       "      <th>MSSubClass_70</th>\n",
       "      <th>MSSubClass_75</th>\n",
       "      <th>MSSubClass_80</th>\n",
       "      <th>MSSubClass_85</th>\n",
       "      <th>MSSubClass_90</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    MSSubClass_120  MSSubClass_150  MSSubClass_160  MSSubClass_180  \\\n",
       "Id                                                                   \n",
       "1                0               0               0               0   \n",
       "2                0               0               0               0   \n",
       "3                0               0               0               0   \n",
       "4                0               0               0               0   \n",
       "5                0               0               0               0   \n",
       "\n",
       "    MSSubClass_190  MSSubClass_20  MSSubClass_30  MSSubClass_40  \\\n",
       "Id                                                                \n",
       "1                0              0              0              0   \n",
       "2                0              1              0              0   \n",
       "3                0              0              0              0   \n",
       "4                0              0              0              0   \n",
       "5                0              0              0              0   \n",
       "\n",
       "    MSSubClass_45  MSSubClass_50  MSSubClass_60  MSSubClass_70  MSSubClass_75  \\\n",
       "Id                                                                              \n",
       "1               0              0              1              0              0   \n",
       "2               0              0              0              0              0   \n",
       "3               0              0              1              0              0   \n",
       "4               0              0              0              1              0   \n",
       "5               0              0              1              0              0   \n",
       "\n",
       "    MSSubClass_80  MSSubClass_85  MSSubClass_90  \n",
       "Id                                               \n",
       "1               0              0              0  \n",
       "2               0              0              0  \n",
       "3               0              0              0  \n",
       "4               0              0              0  \n",
       "5               0              0              0  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#One-hot编码\n",
    "pd.get_dummies(data_all[\"MSSubClass\"], prefix=\"MSSubClass\").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#对所有的catagory进行编码\n",
    "data_all_dummy = pd.get_dummies(data_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>BsmtFinSF2</th>\n",
       "      <th>BsmtUnfSF</th>\n",
       "      <th>...</th>\n",
       "      <th>SaleType_ConLw</th>\n",
       "      <th>SaleType_New</th>\n",
       "      <th>SaleType_Oth</th>\n",
       "      <th>SaleType_WD</th>\n",
       "      <th>SaleCondition_Abnorml</th>\n",
       "      <th>SaleCondition_AdjLand</th>\n",
       "      <th>SaleCondition_Alloca</th>\n",
       "      <th>SaleCondition_Family</th>\n",
       "      <th>SaleCondition_Normal</th>\n",
       "      <th>SaleCondition_Partial</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "      <td>2003</td>\n",
       "      <td>2003</td>\n",
       "      <td>196.0</td>\n",
       "      <td>706.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>150.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>6</td>\n",
       "      <td>8</td>\n",
       "      <td>1976</td>\n",
       "      <td>1976</td>\n",
       "      <td>0.0</td>\n",
       "      <td>978.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>284.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "      <td>2001</td>\n",
       "      <td>2002</td>\n",
       "      <td>162.0</td>\n",
       "      <td>486.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "      <td>1915</td>\n",
       "      <td>1970</td>\n",
       "      <td>0.0</td>\n",
       "      <td>216.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>540.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>8</td>\n",
       "      <td>5</td>\n",
       "      <td>2000</td>\n",
       "      <td>2000</td>\n",
       "      <td>350.0</td>\n",
       "      <td>655.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>490.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 303 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  YearRemodAdd  \\\n",
       "Id                                                                            \n",
       "1          65.0     8450            7            5       2003          2003   \n",
       "2          80.0     9600            6            8       1976          1976   \n",
       "3          68.0    11250            7            5       2001          2002   \n",
       "4          60.0     9550            7            5       1915          1970   \n",
       "5          84.0    14260            8            5       2000          2000   \n",
       "\n",
       "    MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF          ...            \\\n",
       "Id                                                         ...             \n",
       "1        196.0       706.0         0.0      150.0          ...             \n",
       "2          0.0       978.0         0.0      284.0          ...             \n",
       "3        162.0       486.0         0.0      434.0          ...             \n",
       "4          0.0       216.0         0.0      540.0          ...             \n",
       "5        350.0       655.0         0.0      490.0          ...             \n",
       "\n",
       "    SaleType_ConLw  SaleType_New  SaleType_Oth  SaleType_WD  \\\n",
       "Id                                                            \n",
       "1                0             0             0            1   \n",
       "2                0             0             0            1   \n",
       "3                0             0             0            1   \n",
       "4                0             0             0            1   \n",
       "5                0             0             0            1   \n",
       "\n",
       "    SaleCondition_Abnorml  SaleCondition_AdjLand  SaleCondition_Alloca  \\\n",
       "Id                                                                       \n",
       "1                       0                      0                     0   \n",
       "2                       0                      0                     0   \n",
       "3                       0                      0                     0   \n",
       "4                       1                      0                     0   \n",
       "5                       0                      0                     0   \n",
       "\n",
       "    SaleCondition_Family  SaleCondition_Normal  SaleCondition_Partial  \n",
       "Id                                                                     \n",
       "1                      0                     1                      0  \n",
       "2                      0                     1                      0  \n",
       "3                      0                     1                      0  \n",
       "4                      0                     0                      0  \n",
       "5                      0                     1                      0  \n",
       "\n",
       "[5 rows x 303 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_dummy.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LotFrontage          486\n",
       "GarageYrBlt          159\n",
       "MasVnrArea            23\n",
       "BsmtHalfBath           2\n",
       "BsmtFullBath           2\n",
       "BsmtFinSF2             1\n",
       "GarageCars             1\n",
       "TotalBsmtSF            1\n",
       "BsmtUnfSF              1\n",
       "GarageArea             1\n",
       "BsmtFinSF1             1\n",
       "Condition1_Artery      0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#缺失值的填充\n",
    "data_all_dummy.isnull().sum().sort_values(ascending=False).head(12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LotFrontage        69.305795\n",
       "LotArea         10168.114080\n",
       "OverallQual         6.089072\n",
       "OverallCond         5.564577\n",
       "YearBuilt        1971.312778\n",
       "YearRemodAdd     1984.264474\n",
       "MasVnrArea        102.201312\n",
       "BsmtFinSF1        441.423235\n",
       "BsmtFinSF2         49.582248\n",
       "BsmtUnfSF         560.772104\n",
       "TotalBsmtSF      1051.777587\n",
       "1stFlrSF         1159.581706\n",
       "dtype: float64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mean_cols = np.mean(data_all_dummy)\n",
    "mean_cols.head(12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_all_dummy = data_all_dummy.fillna(mean_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>BsmtFinSF2</th>\n",
       "      <th>BsmtUnfSF</th>\n",
       "      <th>...</th>\n",
       "      <th>SaleType_ConLw</th>\n",
       "      <th>SaleType_New</th>\n",
       "      <th>SaleType_Oth</th>\n",
       "      <th>SaleType_WD</th>\n",
       "      <th>SaleCondition_Abnorml</th>\n",
       "      <th>SaleCondition_AdjLand</th>\n",
       "      <th>SaleCondition_Alloca</th>\n",
       "      <th>SaleCondition_Family</th>\n",
       "      <th>SaleCondition_Normal</th>\n",
       "      <th>SaleCondition_Partial</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>69.305795</td>\n",
       "      <td>10168.114080</td>\n",
       "      <td>6.089072</td>\n",
       "      <td>5.564577</td>\n",
       "      <td>1971.312778</td>\n",
       "      <td>1984.264474</td>\n",
       "      <td>102.201312</td>\n",
       "      <td>441.423235</td>\n",
       "      <td>49.582248</td>\n",
       "      <td>560.772104</td>\n",
       "      <td>...</td>\n",
       "      <td>0.002741</td>\n",
       "      <td>0.081877</td>\n",
       "      <td>0.002398</td>\n",
       "      <td>0.865022</td>\n",
       "      <td>0.065091</td>\n",
       "      <td>0.004111</td>\n",
       "      <td>0.008222</td>\n",
       "      <td>0.015759</td>\n",
       "      <td>0.822885</td>\n",
       "      <td>0.083933</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>21.312345</td>\n",
       "      <td>7886.996359</td>\n",
       "      <td>1.409947</td>\n",
       "      <td>1.113131</td>\n",
       "      <td>30.291442</td>\n",
       "      <td>20.894344</td>\n",
       "      <td>178.626089</td>\n",
       "      <td>455.532750</td>\n",
       "      <td>169.176615</td>\n",
       "      <td>439.468337</td>\n",
       "      <td>...</td>\n",
       "      <td>0.052289</td>\n",
       "      <td>0.274225</td>\n",
       "      <td>0.048920</td>\n",
       "      <td>0.341758</td>\n",
       "      <td>0.246728</td>\n",
       "      <td>0.063996</td>\n",
       "      <td>0.090317</td>\n",
       "      <td>0.124562</td>\n",
       "      <td>0.381832</td>\n",
       "      <td>0.277335</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>21.000000</td>\n",
       "      <td>1300.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1872.000000</td>\n",
       "      <td>1950.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>60.000000</td>\n",
       "      <td>7478.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1953.500000</td>\n",
       "      <td>1965.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>220.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>69.305795</td>\n",
       "      <td>9453.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1973.000000</td>\n",
       "      <td>1993.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>369.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>467.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>78.000000</td>\n",
       "      <td>11570.000000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2001.000000</td>\n",
       "      <td>2004.000000</td>\n",
       "      <td>163.500000</td>\n",
       "      <td>733.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>805.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>313.000000</td>\n",
       "      <td>215245.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>1600.000000</td>\n",
       "      <td>5644.000000</td>\n",
       "      <td>1526.000000</td>\n",
       "      <td>2336.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 303 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       LotFrontage        LotArea  OverallQual  OverallCond    YearBuilt  \\\n",
       "count  2919.000000    2919.000000  2919.000000  2919.000000  2919.000000   \n",
       "mean     69.305795   10168.114080     6.089072     5.564577  1971.312778   \n",
       "std      21.312345    7886.996359     1.409947     1.113131    30.291442   \n",
       "min      21.000000    1300.000000     1.000000     1.000000  1872.000000   \n",
       "25%      60.000000    7478.000000     5.000000     5.000000  1953.500000   \n",
       "50%      69.305795    9453.000000     6.000000     5.000000  1973.000000   \n",
       "75%      78.000000   11570.000000     7.000000     6.000000  2001.000000   \n",
       "max     313.000000  215245.000000    10.000000     9.000000  2010.000000   \n",
       "\n",
       "       YearRemodAdd   MasVnrArea   BsmtFinSF1   BsmtFinSF2    BsmtUnfSF  \\\n",
       "count   2919.000000  2919.000000  2919.000000  2919.000000  2919.000000   \n",
       "mean    1984.264474   102.201312   441.423235    49.582248   560.772104   \n",
       "std       20.894344   178.626089   455.532750   169.176615   439.468337   \n",
       "min     1950.000000     0.000000     0.000000     0.000000     0.000000   \n",
       "25%     1965.000000     0.000000     0.000000     0.000000   220.000000   \n",
       "50%     1993.000000     0.000000   369.000000     0.000000   467.000000   \n",
       "75%     2004.000000   163.500000   733.000000     0.000000   805.000000   \n",
       "max     2010.000000  1600.000000  5644.000000  1526.000000  2336.000000   \n",
       "\n",
       "               ...            SaleType_ConLw  SaleType_New  SaleType_Oth  \\\n",
       "count          ...               2919.000000   2919.000000   2919.000000   \n",
       "mean           ...                  0.002741      0.081877      0.002398   \n",
       "std            ...                  0.052289      0.274225      0.048920   \n",
       "min            ...                  0.000000      0.000000      0.000000   \n",
       "25%            ...                  0.000000      0.000000      0.000000   \n",
       "50%            ...                  0.000000      0.000000      0.000000   \n",
       "75%            ...                  0.000000      0.000000      0.000000   \n",
       "max            ...                  1.000000      1.000000      1.000000   \n",
       "\n",
       "       SaleType_WD  SaleCondition_Abnorml  SaleCondition_AdjLand  \\\n",
       "count  2919.000000            2919.000000            2919.000000   \n",
       "mean      0.865022               0.065091               0.004111   \n",
       "std       0.341758               0.246728               0.063996   \n",
       "min       0.000000               0.000000               0.000000   \n",
       "25%       1.000000               0.000000               0.000000   \n",
       "50%       1.000000               0.000000               0.000000   \n",
       "75%       1.000000               0.000000               0.000000   \n",
       "max       1.000000               1.000000               1.000000   \n",
       "\n",
       "       SaleCondition_Alloca  SaleCondition_Family  SaleCondition_Normal  \\\n",
       "count           2919.000000           2919.000000           2919.000000   \n",
       "mean               0.008222              0.015759              0.822885   \n",
       "std                0.090317              0.124562              0.381832   \n",
       "min                0.000000              0.000000              0.000000   \n",
       "25%                0.000000              0.000000              1.000000   \n",
       "50%                0.000000              0.000000              1.000000   \n",
       "75%                0.000000              0.000000              1.000000   \n",
       "max                1.000000              1.000000              1.000000   \n",
       "\n",
       "       SaleCondition_Partial  \n",
       "count            2919.000000  \n",
       "mean                0.083933  \n",
       "std                 0.277335  \n",
       "min                 0.000000  \n",
       "25%                 0.000000  \n",
       "50%                 0.000000  \n",
       "75%                 0.000000  \n",
       "max                 1.000000  \n",
       "\n",
       "[8 rows x 303 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_dummy.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',\n",
       "       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',\n",
       "       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',\n",
       "       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',\n",
       "       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',\n",
       "       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',\n",
       "       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',\n",
       "       'MoSold', 'YrSold'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#对numerical值进行标准化\n",
    "numerical_cols = data_all.columns[data_all.dtypes != 'object']\n",
    "numerical_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>BsmtFinSF2</th>\n",
       "      <th>BsmtUnfSF</th>\n",
       "      <th>...</th>\n",
       "      <th>SaleType_ConLw</th>\n",
       "      <th>SaleType_New</th>\n",
       "      <th>SaleType_Oth</th>\n",
       "      <th>SaleType_WD</th>\n",
       "      <th>SaleCondition_Abnorml</th>\n",
       "      <th>SaleCondition_AdjLand</th>\n",
       "      <th>SaleCondition_Alloca</th>\n",
       "      <th>SaleCondition_Family</th>\n",
       "      <th>SaleCondition_Normal</th>\n",
       "      <th>SaleCondition_Partial</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.202033</td>\n",
       "      <td>-0.217841</td>\n",
       "      <td>0.646073</td>\n",
       "      <td>-0.507197</td>\n",
       "      <td>1.046078</td>\n",
       "      <td>0.896679</td>\n",
       "      <td>0.525112</td>\n",
       "      <td>0.580807</td>\n",
       "      <td>-0.29308</td>\n",
       "      <td>-0.934702</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.501785</td>\n",
       "      <td>-0.072032</td>\n",
       "      <td>-0.063174</td>\n",
       "      <td>2.187904</td>\n",
       "      <td>0.154737</td>\n",
       "      <td>-0.395536</td>\n",
       "      <td>-0.572152</td>\n",
       "      <td>1.177910</td>\n",
       "      <td>-0.29308</td>\n",
       "      <td>-0.629788</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.061269</td>\n",
       "      <td>0.137173</td>\n",
       "      <td>0.646073</td>\n",
       "      <td>-0.507197</td>\n",
       "      <td>0.980053</td>\n",
       "      <td>0.848819</td>\n",
       "      <td>0.334770</td>\n",
       "      <td>0.097856</td>\n",
       "      <td>-0.29308</td>\n",
       "      <td>-0.288467</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.436639</td>\n",
       "      <td>-0.078371</td>\n",
       "      <td>0.646073</td>\n",
       "      <td>-0.507197</td>\n",
       "      <td>-1.859033</td>\n",
       "      <td>-0.682695</td>\n",
       "      <td>-0.572152</td>\n",
       "      <td>-0.494856</td>\n",
       "      <td>-0.29308</td>\n",
       "      <td>-0.047266</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.689469</td>\n",
       "      <td>0.518814</td>\n",
       "      <td>1.355319</td>\n",
       "      <td>-0.507197</td>\n",
       "      <td>0.947040</td>\n",
       "      <td>0.753100</td>\n",
       "      <td>1.387248</td>\n",
       "      <td>0.468851</td>\n",
       "      <td>-0.29308</td>\n",
       "      <td>-0.161040</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 303 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    LotFrontage   LotArea  OverallQual  OverallCond  YearBuilt  YearRemodAdd  \\\n",
       "Id                                                                             \n",
       "1     -0.202033 -0.217841     0.646073    -0.507197   1.046078      0.896679   \n",
       "2      0.501785 -0.072032    -0.063174     2.187904   0.154737     -0.395536   \n",
       "3     -0.061269  0.137173     0.646073    -0.507197   0.980053      0.848819   \n",
       "4     -0.436639 -0.078371     0.646073    -0.507197  -1.859033     -0.682695   \n",
       "5      0.689469  0.518814     1.355319    -0.507197   0.947040      0.753100   \n",
       "\n",
       "    MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF          ...            \\\n",
       "Id                                                         ...             \n",
       "1     0.525112    0.580807    -0.29308  -0.934702          ...             \n",
       "2    -0.572152    1.177910    -0.29308  -0.629788          ...             \n",
       "3     0.334770    0.097856    -0.29308  -0.288467          ...             \n",
       "4    -0.572152   -0.494856    -0.29308  -0.047266          ...             \n",
       "5     1.387248    0.468851    -0.29308  -0.161040          ...             \n",
       "\n",
       "    SaleType_ConLw  SaleType_New  SaleType_Oth  SaleType_WD  \\\n",
       "Id                                                            \n",
       "1                0             0             0            1   \n",
       "2                0             0             0            1   \n",
       "3                0             0             0            1   \n",
       "4                0             0             0            1   \n",
       "5                0             0             0            1   \n",
       "\n",
       "    SaleCondition_Abnorml  SaleCondition_AdjLand  SaleCondition_Alloca  \\\n",
       "Id                                                                       \n",
       "1                       0                      0                     0   \n",
       "2                       0                      0                     0   \n",
       "3                       0                      0                     0   \n",
       "4                       1                      0                     0   \n",
       "5                       0                      0                     0   \n",
       "\n",
       "    SaleCondition_Family  SaleCondition_Normal  SaleCondition_Partial  \n",
       "Id                                                                     \n",
       "1                      0                     1                      0  \n",
       "2                      0                     1                      0  \n",
       "3                      0                     1                      0  \n",
       "4                      0                     0                      0  \n",
       "5                      0                     1                      0  \n",
       "\n",
       "[5 rows x 303 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "numerical_means = data_all_dummy.loc[:, numerical_cols].mean()\n",
    "numerical_std = data_all_dummy.loc[:, numerical_cols].std()\n",
    "data_all_dummy.loc[:, numerical_cols] = (data_all_dummy.loc[:, numerical_cols]-numerical_means)/numerical_std\n",
    "data_all_dummy.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
